
import sys
import html

# The script needs exactly two arguments: the input path and the output path.
if len(sys.argv) != 3:
    print("clean_text.py [inp_file] [out_file]")
    # Use sys.exit rather than the bare exit() helper: exit() is injected by
    # the `site` module for interactive sessions and is not available when
    # Python runs with -S or in some embedded/frozen interpreters.
    # The exit status is intentionally left at 0 to match existing behavior.
    sys.exit(0)


def strip_quote_pairs(text, quote='"'):
    """Remove one pair of enclosing quote characters from *text*.

    The first and last characters are dropped only when *text* has at
    least two characters and both ends equal *quote*. Otherwise *text*
    is returned unchanged. Only the outermost pair is removed.
    """
    # Guard clause: anything too short to hold a pair cannot be wrapped.
    if len(text) < 2:
        return text
    leading, trailing = text[0], text[-1]
    if leading != quote or trailing != quote:
        return text
    return text[1:-1]

def normalize_string(inp_str):
    """Return a normalized copy of *inp_str*.

    Steps, in order:
      1. Decode HTML character references (``&amp;``, ``&#65;``, ...).
      2. Lowercase the result.
      3. Replace each newline with a space and drop carriage returns.
      4. Strip one pair of enclosing double quotes, if present.
    """
    # Decode entities BEFORE lowercasing. Named references are
    # case-sensitive (``&Dagger;`` and ``&dagger;`` are different
    # characters), and numeric references such as ``&#65;`` decode to
    # uppercase letters that would otherwise escape the lowercasing step.
    s = html.unescape(inp_str)
    s = s.lower()
    s = s.replace('\n', ' ').replace('\r', '')
    return strip_quote_pairs(s)


# Script body: read every line of the input file, normalize it, and write
# the results one per line to the output file.
inp_file = sys.argv[1]
out_file = sys.argv[2]
# Read through a context manager so the input handle is closed
# deterministically instead of being left to the garbage collector.
with open(inp_file, "r") as fin:
    lines = [line.strip() for line in fin]
print("lines", len(lines))
lines = [normalize_string(line) for line in lines]
print("lines", len(lines))
with open(out_file, "w") as fout:
    # One batched call instead of a write() per line; output is identical.
    fout.writelines(line + "\n" for line in lines)

